#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
@Time     : 2024/9/5 6:18
@Author   : liujingmao
@File     : 1.语义分割器使用示例.py
"""

import dotenv
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
# from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_unstructured import UnstructuredLoader

# Load environment variables (e.g. OPENAI_API_KEY) before any OpenAI
# client objects are created.
dotenv.load_dotenv()

# Source document: a local Chinese-language short story, parsed by unstructured.
story_loader = UnstructuredLoader("./科幻短篇.txt")

# Semantic splitter: sentences are first cut on CJK/Latin sentence-ending
# punctuation (lookbehind keeps the punctuation attached), then merged into
# roughly `number_of_chunks` groups by embedding similarity.
semantic_splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
    number_of_chunks=10,
    add_start_index=True,
    sentence_split_regex=r"(?<=[。？！.?!])",
)

# Load, split, and report the size and metadata of every resulting chunk.
for piece in semantic_splitter.split_documents(story_loader.load()):
    print(f"块大小:{len(piece.page_content)},元数据:{piece.metadata}")
